Data Science¶

Group 17¶

Daniel Lucas
Ricardo Nobre
Ricardo Carvalho
Diogo Torneiro
Vasco Pombo

In [5]:
#!pip install matplotlib seaborn mlxtend plotly
In [6]:
#basic libraries: numpy and pandasfor data handling, pyplot 
#and seaborn for visualization, math for mathematical operations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from datetime import date
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as stats
import plotly.express as px

#dataset partition
from sklearn.model_selection import train_test_split

#feature selection methods
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

#scaling methods and categorical variable encoder
from sklearn.preprocessing import RobustScaler, OneHotEncoder

#model selection 
from sklearn import model_selection
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

#linear models
from sklearn.linear_model import LogisticRegression, SGDClassifier

#gaussian naive bayes
from sklearn.naive_bayes import GaussianNB

#decision tree classifier
from sklearn.tree import DecisionTreeClassifier

#k-neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer

#principal component analysis
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

#neural network
from sklearn.neural_network import MLPClassifier

#ensemble classifier models
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, \
VotingClassifier, AdaBoostClassifier, StackingClassifier, HistGradientBoostingClassifier, \
ExtraTreesClassifier

#support vector machines
from sklearn.svm import SVC

#model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
make_scorer, classification_report, confusion_matrix, f1_score

from itertools import combinations
from collections import Counter

Importing Data¶

In [7]:
train_data_original = pd.read_csv('train.csv')
test_data_original = pd.read_csv('test.csv')

df = train_data_original.copy()
df_test = test_data_original.copy()

Data Understanding¶

In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15589 entries, 0 to 15588
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cust_ID            15589 non-null  int64  
 1   Churn              15589 non-null  object 
 2   Name               15589 non-null  object 
 3   Longevity          15589 non-null  object 
 4   Year_Birth         15394 non-null  float64
 5   TypeTravel         15589 non-null  object 
 6   RoomType           15589 non-null  object 
 7   RewardPoints       15589 non-null  int64  
 8   Comfort            15589 non-null  int64  
 9   ReceptionSchedule  15589 non-null  int64  
 10  FoodDrink          15589 non-null  int64  
 11  Location           15589 non-null  int64  
 12  Wifi               15589 non-null  int64  
 13  Amenities          15589 non-null  int64  
 14  Staff              15589 non-null  int64  
 15  OnlineBooking      15589 non-null  int64  
 16  PriceQuality       15589 non-null  int64  
 17  RoomSpace          15589 non-null  int64  
 18  CheckOut           15589 non-null  int64  
 19  Checkin            15589 non-null  int64  
 20  Cleanliness        15589 non-null  int64  
 21  BarService         15589 non-null  int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 2.6+ MB
In [9]:
df_original_row_size = df.shape[0]
df_original_columns_size = df.shape[1]

print("Dataframe has", df_original_row_size, "rows and", df_original_columns_size, "columns")
Dataframe has 15589 rows and 22 columns
In [10]:
df.describe().T
Out[10]:
count mean std min 25% 50% 75% max
Cust_ID 15589.0 7795.000000 4500.301008 1.0 3898.0 7795.0 11692.0 15589.0
Year_Birth 15394.0 1981.706444 15.179042 1936.0 1970.0 1981.0 1994.0 2014.0
RewardPoints 15589.0 5022.593816 1027.962379 409.0 4445.0 5088.0 5649.0 6950.0
Comfort 15589.0 2.841619 1.388624 0.0 2.0 3.0 4.0 5.0
ReceptionSchedule 15589.0 2.997242 1.518994 0.0 2.0 3.0 4.0 5.0
FoodDrink 15589.0 2.844570 1.436948 0.0 2.0 3.0 4.0 5.0
Location 15589.0 2.986016 1.299438 1.0 2.0 3.0 4.0 5.0
Wifi 15589.0 3.245109 1.327026 0.0 2.0 3.0 4.0 6.0
Amenities 15589.0 3.374816 1.352417 0.0 2.0 4.0 4.0 5.0
Staff 15589.0 3.506383 1.319565 1.0 3.0 4.0 5.0 5.0
OnlineBooking 15589.0 3.454231 1.310343 0.0 2.0 4.0 5.0 5.0
PriceQuality 15589.0 3.459683 1.268130 1.0 3.0 4.0 4.0 5.0
RoomSpace 15589.0 3.470845 1.293873 0.0 2.0 4.0 5.0 5.0
CheckOut 15589.0 3.700558 1.158644 1.0 3.0 4.0 5.0 5.0
Checkin 15589.0 3.327282 1.266872 1.0 3.0 3.0 4.0 5.0
Cleanliness 15589.0 3.692347 1.154437 1.0 3.0 4.0 5.0 5.0
BarService 15589.0 3.347360 1.300452 0.0 2.0 3.0 4.0 5.0
In [11]:
check = df.copy()

Categorical Variables¶

In [12]:
check['Gender'] = [1 if i == 'Mr.' else 0 for i in check["Name"].str[:3]]
In [13]:
f, axes = plt.subplots(2,2, figsize=(20, 15), squeeze=False)   

sns.countplot(x='RoomType', hue='Churn', data=check, color='darkseagreen', ax=axes[0, 0])
sns.countplot(x='TypeTravel', hue='Churn', data=check, color='tan', ax=axes[0, 1])
sns.countplot(x='Longevity', hue='Churn', data=check, color='cadetblue', ax=axes[1, 0])
sns.countplot(x='Gender', hue='Churn', data=check, color='dimgrey', ax=axes[1, 1])
Out[13]:
<AxesSubplot:xlabel='Gender', ylabel='count'>

Numerical Variables¶

In [14]:
#Reward Points
fig = px.box(data_frame=check, x='Churn', y='RewardPoints') 
fig.show()
In [15]:
#Year Birth
fig = px.box(data_frame=check, x='Churn', y='Year_Birth')
fig.show()
In [16]:
#Comfort
fig = px.box(data_frame=check, x='Churn', y='Comfort')
fig.show()
In [17]:
#ReceptionSchedule
fig = px.box(data_frame=check, x='Churn', y='ReceptionSchedule')
fig.show()
In [18]:
#FoodDrink
fig = px.box(data_frame=check, x='Churn', y='FoodDrink')
fig.show()
In [19]:
#Location
fig = px.box(data_frame=check, x='Churn', y='Location')
fig.show()
In [20]:
#Wifi
fig = px.box(data_frame=check, x='Churn', y='Wifi') 
fig.show()
In [21]:
#Amenities
fig = px.box(data_frame=check, x='Churn', y='Amenities')
fig.show()
In [22]:
#Staff
fig = px.box(data_frame=check, x='Churn', y='Staff') 
fig.show()
In [23]:
#Online Booking
fig = px.box(data_frame=check, x='Churn', y='OnlineBooking')
fig.show()
In [24]:
#Price Quality
fig = px.box(data_frame=check, x='Churn', y='PriceQuality')
fig.show()
In [25]:
#Room Space
fig = px.box(data_frame=check, x='Churn', y='RoomSpace')
fig.show()
In [26]:
#Check Out
fig = px.box(data_frame=check, x='Churn', y='CheckOut')
fig.show()
In [27]:
#Checkin
fig = px.box(data_frame=check, x='Churn', y='Checkin')
fig.show()
In [28]:
#Cleanliness
fig = px.box(data_frame=check, x='Churn', y='Cleanliness')
fig.show()
In [29]:
#Bar Service
fig = px.box(data_frame=check, x='Churn', y='BarService')
fig.show()

Skewness¶

In [30]:
df.skew().sort_values()
Out[30]:
CheckOut            -0.750689
Cleanliness         -0.745131
Amenities           -0.599498
Staff               -0.554561
PriceQuality        -0.503381
RoomSpace           -0.482952
OnlineBooking       -0.472074
RewardPoints        -0.453779
Checkin             -0.382588
BarService          -0.358297
ReceptionSchedule   -0.260705
Wifi                -0.171255
FoodDrink           -0.123610
Comfort             -0.100907
Location            -0.050229
Year_Birth          -0.003847
Cust_ID              0.000000
dtype: float64

Kurtosis¶

In [31]:
df.kurt().sort_values()
Out[31]:
Cust_ID             -1.200000
Wifi                -1.111251
ReceptionSchedule   -1.077243
Location            -1.076583
FoodDrink           -0.967047
BarService          -0.949765
Comfort             -0.935544
OnlineBooking       -0.931811
RoomSpace           -0.864644
Staff               -0.858889
Checkin             -0.812149
PriceQuality        -0.772663
Year_Birth          -0.729800
Amenities           -0.540657
Cleanliness         -0.225183
CheckOut            -0.225016
RewardPoints         0.260135
dtype: float64

Missing Values¶

In [32]:
df.isna().sum()
Out[32]:
Cust_ID                0
Churn                  0
Name                   0
Longevity              0
Year_Birth           195
TypeTravel             0
RoomType               0
RewardPoints           0
Comfort                0
ReceptionSchedule      0
FoodDrink              0
Location               0
Wifi                   0
Amenities              0
Staff                  0
OnlineBooking          0
PriceQuality           0
RoomSpace              0
CheckOut               0
Checkin                0
Cleanliness            0
BarService             0
dtype: int64

Pre Processing¶

In [33]:
# Change the data frame index to the customer ID of each entry, rather than the standard index
df.set_index("Cust_ID", inplace = True)
df_test.set_index("Cust_ID", inplace = True)

Coherence Checks¶

Churn¶

In [34]:
df['Churn'].value_counts()

sns.countplot(df["Churn"], color="skyblue")
plt.show()

Churn is our target variable and it looks somewhat evenly distributed. We will still test with over and under sampled datasets but there is not a huge descrepancy between the values of Churn.

Name¶

In [35]:
df['Name'].value_counts().nlargest(1000)
Out[35]:
Mr. Michael Smith      9
Ms. Amanda Smith       7
Mr. John Smith         7
Mr. Michael Jones      6
Mr. William Smith      6
                      ..
Ms. Elizabeth Adams    2
Mr. Daniel Hall        2
Mr. William Wright     2
Ms. Sandra Lopez       2
Mr. Joseph Martinez    2
Name: Name, Length: 1000, dtype: int64

Longevity¶

In [36]:
df['Longevity'].value_counts()
Out[36]:
yes    12548
no      2874
y        167
Name: Longevity, dtype: int64

Variable does not look good because we have entries with 'yes' and entries with 'y' which most likely mean the same. Will will transform the entries with 'y' into 'yes'

In [37]:
df['Longevity'].replace('y','yes', inplace=True)
df['Longevity'].value_counts()

sns.countplot(df["Longevity"], color="skyblue")
plt.show()

Year Birth¶

In [38]:
df['Year_Birth'].value_counts()
Out[38]:
1982.0    441
1996.0    416
1980.0    383
1998.0    380
1981.0    369
         ... 
1947.0      9
1945.0      8
1943.0      6
1942.0      3
1936.0      2
Name: Year_Birth, Length: 75, dtype: int64
In [39]:
sns.histplot(df["Year_Birth"], color="skyblue")
Out[39]:
<AxesSubplot:xlabel='Year_Birth', ylabel='Count'>

Type Travel¶

In [40]:
df['TypeTravel'].value_counts()
Out[40]:
business    10756
leisure      4833
Name: TypeTravel, dtype: int64
In [41]:
sns.countplot(df["TypeTravel"], color="skyblue")
Out[41]:
<AxesSubplot:xlabel='TypeTravel', ylabel='count'>

Room Type¶

In [42]:
df['RoomType'].value_counts()
Out[42]:
single    7442
double    7021
suite     1126
Name: RoomType, dtype: int64
In [43]:
sns.countplot(df["RoomType"], color="skyblue")
Out[43]:
<AxesSubplot:xlabel='RoomType', ylabel='count'>

Ratings variables¶

In [44]:
f, axes = plt.subplots(3,5, figsize=(20, 15), squeeze=False)  
sns.histplot(df["Comfort"], color="skyblue", ax=axes[0, 0])
sns.histplot(df["ReceptionSchedule"], color="skyblue", ax=axes[0, 1])
sns.histplot(df["FoodDrink"], color="skyblue", ax=axes[0, 2])
sns.histplot(df["Location"], color="skyblue", ax=axes[0, 3])
sns.histplot(df["Wifi"], color="skyblue", ax=axes[0, 4])
sns.histplot(df["Amenities"], color="skyblue", ax=axes[1, 0])
sns.histplot(df["Staff"], color="skyblue", ax=axes[1, 1])
sns.histplot(df["OnlineBooking"], color="skyblue", ax=axes[1, 2])
sns.histplot(df["PriceQuality"], color="skyblue", ax=axes[1, 3])
sns.histplot(df["RoomSpace"], color="skyblue", ax=axes[1, 4])
sns.histplot(df["CheckOut"], color="skyblue", ax=axes[2, 0])
sns.histplot(df["Checkin"], color="skyblue", ax=axes[2, 1])
sns.histplot(df["Cleanliness"], color="skyblue", ax=axes[2, 2])
sns.histplot(df["BarService"], color="skyblue", ax=axes[2, 3])

plt.show()

Upon looking at the charts, we noticed that 'Wifi' has values that should not be there, in particular, some surveys were marked with 6 which is not an available rating.

Since there are only 36 rows with value 6, we will convert them to a 5.

In [45]:
df['Wifi'].replace(6,5, inplace=True)

sns.histplot(df["Wifi"], color="skyblue")
plt.show()

Duplicate Values¶

In [46]:
#create duplicate datafranme
duplicates = df.duplicated()

#visualize the duplicates
df[duplicates]
Out[46]:
Churn Name Longevity Year_Birth TypeTravel RoomType RewardPoints Comfort ReceptionSchedule FoodDrink ... Wifi Amenities Staff OnlineBooking PriceQuality RoomSpace CheckOut Checkin Cleanliness BarService
Cust_ID
8196 nochurn Ms. Abigail York yes 1995.0 leisure double 5098 5 5 5 ... 4 5 5 3 3 4 3 3 3 5
9177 churn Ms. Abigail Kennedy yes 1991.0 business suite 5932 3 3 2 ... 3 3 3 3 4 1 4 3 4 3
9418 nochurn Ms. Abigail Buchanan yes 1972.0 business double 6769 5 4 4 ... 5 5 4 5 5 5 5 2 5 1

3 rows × 21 columns

In [47]:
df.drop_duplicates(inplace = True)
df.shape
Out[47]:
(15586, 21)

Outliers¶

In [48]:
f, axes = plt.subplots(4,4, figsize=(20, 15), squeeze=False)    
sns.boxplot(df["Year_Birth"], ax=axes[0, 0])
sns.boxplot(df["RewardPoints"], ax=axes[0, 1])
sns.boxplot(df["Comfort"], ax=axes[0, 2])
sns.boxplot(df["ReceptionSchedule"], ax=axes[0, 3])
sns.boxplot(df["FoodDrink"], ax=axes[1, 0])
sns.boxplot(df["Location"],  ax=axes[1, 1])
sns.boxplot(df["Wifi"],  ax=axes[1, 2])
sns.boxplot(df["Amenities"],  ax=axes[1, 3])
sns.boxplot(df["Staff"],  ax=axes[2, 0])
sns.boxplot(df["OnlineBooking"],  ax=axes[2, 1])
sns.boxplot(df["PriceQuality"],  ax=axes[2, 2])
sns.boxplot(df["RoomSpace"], ax=axes[2, 3])
sns.boxplot(df["CheckOut"], ax=axes[3, 0])
sns.boxplot(df["Checkin"], ax=axes[3, 1])
sns.boxplot(df["Cleanliness"], ax=axes[3, 2])
sns.boxplot(df["BarService"], ax=axes[3, 3])

plt.show()
In [49]:
#method to return the boundaries of IQR 
def get_IQR_bounds(s):
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    
    iqr = q3 - q1
    
    lower_bound = q1 -(1.5 * iqr)
    upper_bound = q3 +(1.5 * iqr)
    
    return (lower_bound,upper_bound)

Reward Points¶

In [50]:
df = df[df['RewardPoints'] > get_IQR_bounds(df['RewardPoints'])[0]]

print("There are been",df_original_row_size - df.shape[0], "outliers removed")
There are been 293 outliers removed
In [51]:
sns.histplot(df["RewardPoints"], color="skyblue")
plt.show()

Ratings Variables¶

After some research and analysis on the subject, we decided not to remove outliers on the two ratings variables that presented outliers in the boxplot visualization. Those are 'PriceQuality' and 'Checkin'. The reason we did not remove them was because they were too many entries that would have to be removed, and if we did, we would effectively be shortening the rating scale for the rating data.

Instead, we will look for straight lining on the ratings entries, meaning people who answered all the answers with the same variable, which can mean they were in a rush and decided to fill the survey as fast as they could.

Straight Lining¶

In [52]:
straight_lining_entries = df[(df['Comfort'] == df['ReceptionSchedule']) & (df['Comfort'] == df['FoodDrink']) & (df['Comfort'] == df['Location']) & (df['Comfort'] == df['Wifi']) & (df['Comfort'] == df['Amenities']) & (df['Comfort'] == df['Staff']) & (df['Comfort'] == df['OnlineBooking']) & (df['Comfort'] == df['PriceQuality']) & (df['Comfort'] == df['RoomSpace']) & (df['Comfort'] == df['CheckOut']) & (df['Comfort'] == df['Checkin']) & (df['Comfort'] == df['Cleanliness']) & (df['Comfort'] == df['BarService'])]
straight_lining_entries
Out[52]:
Churn Name Longevity Year_Birth TypeTravel RoomType RewardPoints Comfort ReceptionSchedule FoodDrink ... Wifi Amenities Staff OnlineBooking PriceQuality RoomSpace CheckOut Checkin Cleanliness BarService
Cust_ID
1549 nochurn Mr. Marcus Burns yes 1981.0 business single 5289 4 4 4 ... 4 4 4 4 4 4 4 4 4 4
15260 nochurn Mr. Randy Robbins yes 1976.0 business single 6699 5 5 5 ... 5 5 5 5 5 5 5 5 5 5

2 rows × 21 columns

In [53]:
df.drop(straight_lining_entries.index, axis=0, inplace=True)

We found 2 entries that are considered straight lining answers and we will remove them from the data

Feature Engineering¶

In [54]:
#churn = 1, nochurn = 0
df['Churn'] = [1 if i == 'churn' else 0 for i in df["Churn"]]
In [55]:
df_data = df.drop(['Churn'], axis=1).copy()
df_target = df['Churn'].copy()
In [56]:
#Method to transform the data

def transform_data(X, age=False):
        print()
        # get all numerical variables
        X_train_num = X[num_vars]
        #
        # get all categorical variables
        X_train_cat = X[cat_vars]
        #df_scaled['Longevity'] = [1 if i == 'yes' else 0 for i in df_scaled["Longevity"]]
        #df_scaled['TypeTravel'] = [1 if i == 'business' else 0 for i in df_scaled["TypeTravel"]]
        
        # fill missing values (KNN Imputer for Year of Birth: first variable of numerical variables)
        k_imputer = round(np.sqrt(len(X_train_num)),0).astype('int32') # 125
        imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
        imputer.fit(X_train_num)
        data_KNN_train = imputer.transform(X_train_num)
        data_KNN_train = pd.DataFrame(data_KNN_train)
        data_KNN_train[0] = data_KNN_train[0].round(0)
        X_train_num['Year_Birth'] = data_KNN_train[0].values
        if age:
            X_train_num['Age'] = date.today().year - X_train_num['Year_Birth']
            
        
        # Apply scaling to numerical data
        scaler = MinMaxScaler().fit(X_train_num)
        X_train_scaled = pd.DataFrame(scaler.transform(X_train_num), columns = X_train_num.columns, index = X_train_num.index,) # MinMaxScaler in the training data
        
        return X_train_scaled, X_train_cat
In [57]:
#Gender Variable

df_data["Characters"] = df_data["Name"].str[:3]
df_data['Characters'].unique()

df_data.drop(columns=["Characters"], axis=1, inplace=True)
df_data['Gender'] = ['Male' if i == 'Mr.' else 'Female' for i in df_data["Name"].str[:3]]

df_data.drop(columns=['Name'], inplace=True)
In [58]:
cat_vars = ['Gender', 'Longevity', 'TypeTravel', 'RoomType']
num_vars = list(set(df_data.columns) - set(cat_vars))

Feature Selection¶

In [55]:
def select_best_features(X, y, splits, age=False):
    sel_log = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_tree = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_forest = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    sel_lasso = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    
    if age:
        sel_log.append(0)
        sel_tree.append(0)
        sel_forest.append(0)
        sel_lasso.append(0)
        
    skf = StratifiedKFold(n_splits = splits, shuffle=True, random_state=42)
    counter = 0
    for train_index, val_index in skf.split(X, y):
        counter +=1
        print('')
        print('--------------------------------------------------------')
        print('SPLIT ', counter)
        print('--------------------------------------------------------')
        print('')
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        
        X_train_scaled, X_train_cat = transform_data(X_train, age)
        X_val_scaled, X_val_cat = transform_data(X_val, age)

        # Check which features to use using RFE and Logistic Regression
        print('')
        print('----------------- RFE ----------------------')
        model = LogisticRegression()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_log=np.add(sel_log,list(map(int, selected_features)))

         # Check which features to use using RFE and decision tree
        model = DecisionTreeClassifier()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_tree=np.add(sel_tree,list(map(int, selected_features)))
        
        # Check which features to use using RFE and random forest
        model = RandomForestClassifier()
        rfe = RFE(estimator = model, n_features_to_select = 5)
        X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
        selected_features = pd.Series(rfe.support_, index = X_train_scaled.columns)
        sel_forest=np.add(sel_forest,list(map(int, selected_features)))

        #Lasso
        reg = LassoCV()
        reg.fit(X=X_train_scaled, y=y_train.replace('nochurn',0).replace('churn',1))
        print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
        print("Best score using built-in LassoCV: %f" %reg.score(X = X_train_scaled,y = y_train.replace('nochurn',0).replace('churn',1)))
        coef = pd.Series(reg.coef_, index = X_train_scaled.columns)
        sel_lasso=np.add(sel_lasso,coef.values)
        
        # Check which features to use using Chi-Square
        print('')
        print('----------------- CHI-SQUARE ----------------------')
        def TestIndependence(X,y,var,alpha=0.05):        
            dfObserved = pd.crosstab(y,X) 
            chi2, p, dof, expected = stats.chi2_contingency(dfObserved.values)
            dfExpected = pd.DataFrame(expected, columns=dfObserved.columns, index = dfObserved.index)
            if p<alpha:
                result="{0} is IMPORTANT for Prediction".format(var)
            else:
                result="{0} is NOT important for Prediction. (Discard {0} from model)".format(var)
            print(result)
        
        for var in X_train_cat:
            TestIndependence(X_train_cat[var],y_train, var)
        
        sel_lasso = sel_lasso/splits
        
        if age==True:
            num_vars.append("Age")
         
        final = pd.DataFrame(np.array([sel_log,sel_tree,sel_forest,sel_lasso]),\
                         columns=set(num_vars),index=['Logistic Regression','Decision Tree','Random Forest','Lasso'])
                             
    print(final.T)
In [59]:
select_best_features(df_data, df_target, 5)
--------------------------------------------------------
SPLIT  1
--------------------------------------------------------




----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000070
Best score using built-in LassoCV: 0.408180

----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction

--------------------------------------------------------
SPLIT  2
--------------------------------------------------------




----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000080
Best score using built-in LassoCV: 0.411028

----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction

--------------------------------------------------------
SPLIT  3
--------------------------------------------------------




----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000113
Best score using built-in LassoCV: 0.403751

----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction

--------------------------------------------------------
SPLIT  4
--------------------------------------------------------




----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000185
Best score using built-in LassoCV: 0.410113

----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction

--------------------------------------------------------
SPLIT  5
--------------------------------------------------------




----------------- RFE ----------------------
Best alpha using built-in LassoCV: 0.000158
Best score using built-in LassoCV: 0.405094

----------------- CHI-SQUARE ----------------------
Gender is IMPORTANT for Prediction
Longevity is IMPORTANT for Prediction
TypeTravel is IMPORTANT for Prediction
RoomType is IMPORTANT for Prediction
                   Logistic Regression  Decision Tree  Random Forest     Lasso
Year_Birth                         0.0            5.0            5.0  0.020899
Location                           0.0            1.0            0.0 -0.015053
ReceptionSchedule                  5.0            5.0            5.0 -0.165488
Checkin                            4.0            0.0            0.0 -0.038071
Amenities                          1.0            0.0            0.0 -0.029079
CheckOut                           5.0            4.0            5.0 -0.067565
BarService                         0.0            5.0            5.0 -0.005781
Cleanliness                        5.0            0.0            0.0 -0.049719
Staff                              5.0            0.0            0.0 -0.057884
OnlineBooking                      0.0            0.0            0.0  0.034264
RewardPoints                       0.0            0.0            0.0 -0.000308
Wifi                               0.0            0.0            0.0 -0.018957
FoodDrink                          0.0            0.0            0.0 -0.000017
Comfort                            0.0            0.0            0.0  0.015966
PriceQuality                       0.0            0.0            0.0  0.028827
RoomSpace                          0.0            5.0            5.0 -0.033102
In [56]:
def cor_heatmap(cor):
    plt.figure(figsize=(12,10))
    sns.heatmap(data = cor, annot = True,  fmt='.1')
    plt.show()
In [57]:
cor_heatmap(df_data.corr(method = 'spearman'))
Predictor RFE Logistic RFE Decision Tree RFE Random Forest Lasso Correlation Include in the model?
Year_Birth Keep Keep Keep Keep? Keep Keep
RewardPoints Discard Discard Discard Discard Keep Discard
Comfort Discard Discard Discard Discard Discard Discard
ReceptionSchedule Keep? Discard Keep Keep? Keep Discard
FoodDrink Keep Discard Discard Keep Keep Keep
Location Keep Discard Discard Keep Keep Keep
Wifi Discard Discard Discard Keep Keep Discard
Amenities Discard Keep? Discard Keep? Keep Keep?
Staff Discard Keep Discard Keep? Discard Discard
OnlineBooking Discard Keep Keep Discard Discard Discard
PriceQuality Keep Discard Discard Keep? Keep Discard
RoomSpace Keep Keep Keep Keep Keep Keep
CheckOut Discard Keep Keep Keep? Keep Keep?
CheckIn Discard Discard Discard Keep? Keep Discard
Cleanliness Discard Discard Discard Keep? Keep Discard
BarService Discard Discard Discard Discard Discard Discard

Numerical variables to keep¶

  • Year_birth
  • FoodDrink
  • Location
  • Amenities
  • RoomSpace
  • CheckOut

Categorical variables to keep¶

From the Chi-Square, every single variable was marked as important

In [59]:
selected_cat = ['Longevity','TypeTravel','RoomType','Gender']
selected_num = ['Year_Birth', 'FoodDrink', 'Location', 'Amenities', 'RoomSpace', 'CheckOut']
In [60]:
features_to_drop = ['RewardPoints','Comfort','ReceptionSchedule','Wifi','Staff','OnlineBooking','PriceQuality',
                    'Checkin','Cleanliness','BarService']
In [61]:
df_data.drop(columns = features_to_drop, inplace = True, axis=1)

Creating Dummies¶

In [62]:
#yes = 1, no = 0
df_data['Longevity'] = [1 if i == 'yes' else 0 for i in df_data["Longevity"]]

#business = 1, leisure = 0
df_data['TypeTravel'] = [1 if i == 'business' else 0 for i in df_data["TypeTravel"]]

#Male = 1, Female = 0
df_data['Gender'] = [1 if i == 'Male' else 0 for i in df_data["Gender"]]

df_data = pd.get_dummies(df_data, drop_first = True)

Scaling Data¶

In [63]:
scaler = MinMaxScaler()
df_data = pd.DataFrame(scaler.fit_transform(df_data), index=df_data.index, columns = df_data.columns)

Modelling¶

In [64]:
def compare_models(X, y, model):
    # apply StratifiedK-Fold
    skf = StratifiedKFold(n_splits = 5)
    score_train = []
    score_val = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        # This time we are going to use validation to check overfitting 
        # so we need also to make all the needed changes in the validation
        
        # fill missing values (mean in numerical data, mode in categorical data)
        #median_age_train = X_train['age'].median() # age is no longer used
        #X_train['age'].fillna(median_age_train, inplace = True)
        #X_val['age'].fillna(median_age_train, inplace = True)
        k_imputer = round(np.sqrt(len(X_train[selected_num])),0).astype('int32') # 125
        imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
        imputer.fit(X_train[selected_num])
        data_KNN_train = imputer.transform(X_train[selected_num])
        data_KNN_train = pd.DataFrame(data_KNN_train)
        data_KNN_train[0] = data_KNN_train[0].round(0)
        X_train['Year_Birth'] = data_KNN_train[0].values
        # Use Train Imputer for Validation Data
        data_KNN_val = imputer.transform(X_val[selected_num])
        data_KNN_val = pd.DataFrame(data_KNN_val)
        data_KNN_val[0] = data_KNN_val[0].round(0)
        X_val['Year_Birth'] = data_KNN_val[0].values
        
        
        # Data Scaling
        # Apply MinMaxScaler
        #scaler = MinMaxScaler().fit(X_train[selected_num])
        #X_train_scaled = scaler.transform(X_train[selected_num]) 
        #X_val_scaled = scaler.transform(X_val[selected_num]) # Scaling with 'scaler' from train data

        # Apply model
        model.fit(X_train, y_train)
        predictions_train = model.predict(X_train)
        predictions_val = model.predict(X_val)
        score_train.append(f1_score(y_train, predictions_train))
        score_val.append(f1_score(y_val, predictions_val))

    avg_train = round(np.mean(score_train),3)
    avg_val = round(np.mean(score_val),3)
    std_train = round(np.std(score_train),2)
    std_val = round(np.std(score_val),2)

    return str(avg_train) + '+/-' + str(std_train),str(avg_val) + '+/-' + str(std_val)
In [65]:
def show_results(df, X, y, *args):
    """
    Receive an empty dataframe and the different models and call the function avg_score
    """
    count = 0
    # for each model passed as argument
    for arg in args:
        # obtain the results provided by avg_score
        avg_train, avg_test = compare_models(X, y, arg)
        # store the results in the right row
        df.iloc[count] = avg_train, avg_test
        count+=1
    
    return df
In [71]:
model_LR = LogisticRegression()
model_KNN = KNeighborsClassifier()
model_GB = GradientBoostingClassifier()
model_HGB = HistGradientBoostingClassifier()
model_AB = AdaBoostClassifier()
model_SVC = SVC()
model_G = GaussianNB()
model_ET = ExtraTreesClassifier()
model_RF = RandomForestClassifier()
model_DT = DecisionTreeClassifier()
model_MLP = MLPClassifier()

df = pd.DataFrame(columns = ['Train','Validation'], index = 
                  ['Logistic Regression','KNN', 'GradientBoost',
                   'HistGradientBoost', 'AdaBoost', 
                  'SVC', 'Gaussian', 'ExtraTrees',
                  'RandomForest', 'DecisonTree',
                  'MLPClassifier'])
show_results(df, df_data, df_target, 
             model_LR, model_KNN, model_GB, 
             model_HGB, model_AB, model_SVC,
             model_G, model_ET, model_RF,
             model_DT, model_MLP)
Out[71]:
Train Validation
Logistic Regression 0.787+/-0.0 0.787+/-0.01
KNN 0.896+/-0.0 0.855+/-0.0
GradientBoost 0.87+/-0.0 0.866+/-0.0
HistGradientBoost 0.9+/-0.0 0.882+/-0.0
AdaBoost 0.826+/-0.0 0.826+/-0.01
SVC 0.875+/-0.0 0.869+/-0.0
Gaussian 0.756+/-0.0 0.756+/-0.01
ExtraTrees 0.951+/-0.0 0.86+/-0.0
RandomForest 0.952+/-0.0 0.868+/-0.0
DecisonTree 0.951+/-0.0 0.839+/-0.01
MLPClassifier 0.882+/-0.0 0.873+/-0.01

Modelling and Predictions¶

In [66]:
#First we must transform the test dataset to be able to be used in modeling

#drop variables
df_test.drop(columns=features_to_drop, inplace = True)

#feature enginering
df_test["Characters"] = df_test["Name"].str[:3]
df_test['Characters'].unique()

df_test.drop(columns=["Characters"], axis=1, inplace=True)
df_test['Gender'] = ['Male' if i == 'Mr.' else 'Female' for i in df_test["Name"].str[:3]]

df_test.drop(columns=['Name'], inplace=True)

#dummies
#yes = 1, no = 0
df_test['Longevity'] = [1 if i == 'yes' else 0 for i in df_test["Longevity"]]

#business = 1, leisure = 0
df_test['TypeTravel'] = [1 if i == 'business' else 0 for i in df_test["TypeTravel"]]

#Male = 1, Female = 0
df_test['Gender'] = [1 if i == 'Male' else 0 for i in df_test["Gender"]]

df_test = pd.get_dummies(df_test, drop_first = True)

#scaling
scaler = MinMaxScaler()
df_test = pd.DataFrame(scaler.fit_transform(df_test), index=df_test.index, columns = df_test.columns)
In [67]:
def score(y_val,y_pred):
    '''Shows the micro f score, then a complete analysis,
    with precision, recall, f1-score, and support, for both
    training and validation sets, after that the accuracy, and
    finally the base and weighted averages
    
    Requires: the target from the validation dataset 
    and the corresponding prediction
    '''
    print('Micro f1 score:', f1_score(y_val, y_pred, average='micro'))
    #calculating the micro f1 score
    print('\nResults on the data set:')
    print(classification_report(y_true = y_val, y_pred = y_pred))
In [68]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
imputer_matrix = imputer.fit_transform(df_data)

data_cleaned = pd.DataFrame(data=imputer_matrix, columns = df_data.columns, index = df_data.index)
df_data['Year_Birth'] = data_cleaned['Year_Birth']
In [69]:
X_train, X_val, y_train, y_val = train_test_split(df_data, df_target, test_size=0.3, random_state=123)

Gradient Boosting¶

In [70]:
clf =  GradientBoostingClassifier()

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.86945 (+/- 0.01)
Micro f1 score: 0.8842885160165613

Results on the data set:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      2492
           1       0.88      0.86      0.87      2097

    accuracy                           0.88      4589
   macro avg       0.88      0.88      0.88      4589
weighted avg       0.88      0.88      0.88      4589

In [71]:
 def search_cv_gb(x_train, y_train, x_val, y_val):
     '''Determines the optimal parameters for a random forest classifier for
     the input data, and prints the optimal random forest classifier, a full
     list of its parameters, and the cross-validation scores for both
     training and validation data
    
     Requires: train and validation data, both features and target in both cases
     '''
     model=GradientBoostingClassifier(random_state=15)
     # grid search - find best parameters
     parameters = {'n_estimators':[10,50,100,200],
     'max_depth':[5,10],
     'min_samples_split':[3,7],
     'min_samples_leaf':[1,2],
     'max_features':['auto','log2',None]}
                  
     clf = GridSearchCV(model, param_grid=parameters)
     grid_search = clf.fit(x_train, y_train)
     # mark result
     print("Best score: %0.3f" % grid_search.best_score_)
     print(grid_search.best_estimator_)

     # best prarams
     print('best prarams:', clf.best_params_)

     print('-----grid search end------------')
     print('on all train set')
     scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
     print(scores.mean(), scores)
     print('on test set')
     scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
     print(scores.mean(), scores)
In [72]:
#search_cv_gb(X_train, y_train, X_val, y_val)
Best score: 0.896
GradientBoostingClassifier(max_depth=10, max_features='log2',
                           min_samples_split=3, n_estimators=50,
                           random_state=15)
best prarams: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50}
-----grid search end------------
on all train set
0.8832254334479576 [0.88063823 0.89194139 0.87709668]
on test set
0.8642061670774267 [0.85507246 0.8702509  0.86729514]

Random Forest¶

In [79]:
clf =  RandomForestClassifier(random_state=15)

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.88110 (+/- 0.00)
Micro f1 score: 0.8860318152102854

Results on the data set:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      2492
           1       0.88      0.87      0.87      2097

    accuracy                           0.89      4589
   macro avg       0.89      0.88      0.89      4589
weighted avg       0.89      0.89      0.89      4589

In [80]:
 def search_cv_rf(x_train, y_train, x_val, y_val):
     '''Determines the optimal parameters for a random forest classifier for
     the input data, and prints the optimal random forest classifier, a full
     list of its parameters, and the cross-validation scores for both
     training and validation data
    
     Requires: train and validation data, both features and target in both cases
     '''
     model=RandomForestClassifier(random_state=15)
     # grid search - find best parameters
     parameters = {'n_estimators':[10,50,100,200],
     'criterion':['gini','entropy'],
     'max_depth':[5,10],
     'min_samples_split':[3,7],
     'min_samples_leaf':[1,2],
     'max_features':['auto','log2',None]}
                  
     clf = GridSearchCV(model, param_grid=parameters)
     grid_search = clf.fit(x_train, y_train)
     # mark result
     print("Best score: %0.3f" % grid_search.best_score_)
     print(grid_search.best_estimator_)

     # best prarams
     print('best prarams:', clf.best_params_)

     print('-----grid search end------------')
     print('on all train set')
     scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
     print(scores.mean(), scores)
     print('on test set')
     scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
     print(scores.mean(), scores)
In [ ]:
#search_cv_rf(X_train, y_train, X_val, y_val)
In [68]:
clf =  RandomForestClassifier(criterion= 'entropy', max_depth= 10, max_features= 'auto', min_samples_leaf= 1, min_samples_split= 3, n_estimators= 200)

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.88124 (+/- 0.01)
Micro f1 score: 0.8932229243843974

Results on the data set:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2492
           1       0.89      0.87      0.88      2097

    accuracy                           0.89      4589
   macro avg       0.89      0.89      0.89      4589
weighted avg       0.89      0.89      0.89      4589

MLP¶

In [83]:
clf =  MLPClassifier(random_state=15)

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.86579 (+/- 0.01)
Micro f1 score: 0.8842885160165613

Results on the data set:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      2492
           1       0.89      0.85      0.87      2097

    accuracy                           0.88      4589
   macro avg       0.89      0.88      0.88      4589
weighted avg       0.88      0.88      0.88      4589

In [84]:
 def search_mlp(x_train, y_train, x_val, y_val):
     '''Determines the optimal parameters for a mlp  classifier for
     the input data, and prints the optimal mlp classifier, a full
     list of its parameters, and the cross-validation scores for both
     training and validation data
    
     Requires: train and validation data, both features and target in both cases
     '''
     model=MLPClassifier(random_state=15)
     # grid search - find best parameters
     parameters = {'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],}
                  
     clf = GridSearchCV(model, param_grid=parameters)
     grid_search = clf.fit(x_train, y_train)
     # mark result
     print("Best score: %0.3f" % grid_search.best_score_)
     print(grid_search.best_estimator_)

     # best prarams
     print('best prarams:', clf.best_params_)

     print('-----grid search end------------')
     print('on all train set')
     scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='f1')
     print(scores.mean(), scores)
     print('on test set')
     scores = cross_val_score(grid_search.best_estimator_, x_val, y_val, cv=3, scoring='f1')
     print(scores.mean(), scores)
In [69]:
clf =  MLPClassifier(activation= 'tanh', alpha= 0.05, hidden_layer_sizes= (50, 100, 50), learning_rate= 'constant', solver= 'adam')

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.87091 (+/- 0.01)
Micro f1 score: 0.8805840052298975

Results on the data set:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      2492
           1       0.89      0.84      0.87      2097

    accuracy                           0.88      4589
   macro avg       0.88      0.88      0.88      4589
weighted avg       0.88      0.88      0.88      4589

KNN¶

In [70]:
clf = KNeighborsClassifier()

print('-------MODEL WITH THE SELECTED FEATURES-------')
scores = model_selection.cross_val_score(clf, X_train, y_train, cv=3, scoring='f1')
print("Score on train: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std()))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
score(y_val,y_pred)
-------MODEL WITH THE SELECTED FEATURES-------
Score on train: 0.85889 (+/- 0.01)
Micro f1 score: 0.8651122248855961

Results on the data set:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      2492
           1       0.84      0.86      0.85      2097

    accuracy                           0.87      4589
   macro avg       0.86      0.87      0.86      4589
weighted avg       0.87      0.87      0.87      4589

DecisionTree LogisticRegression and ExtraTrees¶

In [71]:
def transform_data_train(X, age=False):
        cat_vars = ['Name', 'Longevity', 'TypeTravel', 'RoomType']
        num_vars = list(set(X.columns) - set(cat_vars))
        X_train_1 = X.drop(['Name'], axis=1)
        X_train_1['Longevity'] = ['yes' if (i == 'yes' or i == 'y') else 'no' for i in X_train_1['Longevity']]
        if 'Churn' in X_train_1.columns:
            X_train_1.drop(['Churn'], axis=1, inplace=True)
            
        X_train_2 = pd.get_dummies(X_train_1, drop_first = True)

        # fill missing values (KNN Imputer for Year of Birth: first variable of numerical variables)
        k_imputer = round(np.sqrt(len(X_train_2)),0).astype('int32') # 125
        imputer = KNNImputer(n_neighbors=k_imputer, weights="uniform", metric='nan_euclidean')
        imputer.fit(X_train_2)
        data_KNN_train = imputer.transform(X_train_2)
        data_KNN_train = pd.DataFrame(data_KNN_train)
        data_KNN_train[0] = data_KNN_train[0].round(0)
        X_train_2['Year_Birth'] = data_KNN_train[0].values
        if age:
            X_train_2['Age'] = date.today().year - X_train_2['Year_Birth']
            X_train_2 = X_train_2.drop('Year_Birth', axis=1)
        
        # Apply scaling to numerical data
        scaler = MinMaxScaler().fit(X_train_2)
        X_train_scaled = pd.DataFrame(scaler.transform(X_train_2), columns = X_train_2.columns, index = X_train_2.index,) # MinMaxScaler in the training data
        
        return X_train_scaled
In [72]:
models ={}
models["DecisionTree"] = DecisionTreeClassifier()
models["LogisticRegression"] = LogisticRegression()
models["ExtraTrees"] = ExtraTreesClassifier()
In [73]:
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']

X_train = transform_data_train(X, age=True)
print(set(X_train.columns) - set(transform_data_train(X, age=True).columns))

test_data_original = pd.read_csv('test.csv')

X_2 = test_data_original.iloc[:,1:]

X_test = transform_data_train(X_2, age=True)
set()

Decision Tree¶

In [74]:
cv = StratifiedKFold(n_splits=5)
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"], # [320, 340, 360, 380, 400],
    "max_depth": [32, None, 10000, 12000],
    "max_features": [0.9909267486266218, 0.88],
    "min_samples_leaf": [5, 3, 1],
    "min_samples_split": [12, 6, 2]# [25, 30, 32, 34, 38, 45]
}
searchCV = GridSearchCV(estimator=models["DecisionTree"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)

searchCV.fit(X_train, y)

print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best index: 117
Best score: 0.9171850122329841
Best params: {'criterion': 'entropy', 'max_depth': 10000, 'max_features': 0.88, 'min_samples_leaf': 5, 'min_samples_split': 12}

Logistic Regression¶

In [75]:
selected = ["OnlineBooking", "BarService", "Age", "Comfort", "Staff"]
#To drop ['RewardPoints','ReceptionSchedule', 'Staff', 'OnlineBooking', 'RoomSpace', 'BarService']
X_train_selected = X_train[selected]
X_test_selected = X_test[selected]
In [76]:
from scipy.stats import loguniform

#Decision Tree
cv = StratifiedKFold(n_splits=5)

param_grid = {
    "solver": ['newton-cg', 'lbfgs', 'liblinear'], # [320, 340, 360, 380, 400],
    "penalty": ['none', 'l1', 'l2', 'elasticnet'],
    "C": list(loguniform.rvs(1e-5, 100, size=30))
}
searchCV = GridSearchCV(estimator=models["LogisticRegression"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)

# WARNING: This could take some time to run.
searchCV.fit(X_train_selected, y)

print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best index: 149
Best score: 0.749694293722912
Best params: {'C': 0.007376306744870614, 'penalty': 'l1', 'solver': 'liblinear'}

ExtraTrees¶

In [77]:
cv = StratifiedKFold(n_splits=5)
param_grid = {
    "bootstrap": [True, False],   # [0, 1, 2, 3, 4],
    "criterion": ["gini", "entropy", "log_loss"], # [320, 340, 360, 380, 400],
    "max_depth": [32, None, 10000, 12000],
    "max_features": [0.9909267486266218, 0.88],
    "min_samples_leaf": [5, 3, 1],
    "min_samples_split": [12, 6, 2]# [25, 30, 32, 34, 38, 45]
}
searchCV = GridSearchCV(estimator=models["ExtraTrees"], scoring='accuracy', cv=cv, param_grid=param_grid, verbose=True)

# WARNING: This could take some time to run.
searchCV.fit(X_train, y)

print('Best index:', searchCV.best_index_)
print('Best score:', searchCV.best_score_)
print('Best params:', searchCV.best_params_)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best index: 232
Best score: 0.9462441785544439
Best params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 32, 'max_features': 0.88, 'min_samples_leaf': 1, 'min_samples_split': 6}
In [78]:
best_so_far = {'bootstrap': False, 'criterion': 'gini', 'max_depth': 12000, 'max_features': 0.88, 'min_samples_leaf': 1, 'min_samples_split': 6}
In [79]:
X_train.columns, X_test.columns
Out[79]:
(Index(['RewardPoints', 'Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
        'Wifi', 'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
        'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness', 'BarService',
        'Longevity_yes', 'TypeTravel_leisure', 'RoomType_single',
        'RoomType_suite', 'Age'],
       dtype='object'),
 Index(['RewardPoints', 'Comfort', 'ReceptionSchedule', 'FoodDrink', 'Location',
        'Wifi', 'Amenities', 'Staff', 'OnlineBooking', 'PriceQuality',
        'RoomSpace', 'CheckOut', 'Checkin', 'Cleanliness', 'BarService',
        'Longevity_yes', 'TypeTravel_leisure', 'RoomType_single',
        'RoomType_suite', 'Age'],
       dtype='object'))
In [80]:
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']

X_train = transform_data_train(X, age=True)

test_data_original = pd.read_csv('test.csv')

X_2 = test_data_original.iloc[:,1:]

X_test = transform_data_train(X_2, age=True)

test_data_original = pd.read_csv('test.csv')
delivery_model_3 = ExtraTreesClassifier(**best_so_far)
delivery_model_3.fit(X_train, y)
yhat = delivery_model_3.predict(X_test)
In [ ]:
sub = pd.DataFrame(data=yhat, columns=["Churn"])
sub["Cust_ID"] = test_data_original["Cust_ID"]
sub.set_index("Cust_ID", inplace=True)
sub['Churn'] = [1 if (i == 'churn') else 0 for i in sub['Churn']]

CREATIVITY AUTO_SKLEARN¶

In [ ]:
X = train_data_original.iloc[:,1:]
y = train_data_original.loc[:,'Churn']

X_train = transform_data_train(X, age=True)

test_data_original = pd.read_csv('test.csv')

X_2 = test_data_original.iloc[:,1:]

X_test = transform_data_train(X_2, age=True)
In [ ]:
import (accuracy,
         f1,
         roc_auc,
         precision,
         average_precision,
         recall,
         log_loss)
In [ ]:
#this package only work on unix based so, I've used docker to run it, will sent a tutorial attached
from autosklearn.classification import AutoSklearnClassifier
import autosklearn.metrics as skm 
In [ ]:
cv = StratifiedKFold(n_splits=5)
  
autoML_classifier = AutoSklearnClassifier(time_left_for_this_task=650,
                            max_models_on_disc=6,
                            resampling_strategy=cv,
                            ensemble_size = 4,
                            metric = skm.accuracy,
                            scoring_functions=[skm.accuracy, skm.roc_auc, skm.average_precision, skm.f1, skm.precision, skm.recall])

autoML_classifier.fit(X = X_train, y = y)
In [ ]:
autoML_classifier.leaderboard(detailed = True, ensemble_only=False)
In [ ]:
autoML_classifier.sprint_statistics()

Submission¶

In [ ]:
df_test=df_test[df_data.columns]
#the testing dataframe is cropped: only the columns used in
#training are left
y_final =  clf.predict(df_test)
#predicts the target for the test data
df_test['Churn']=y_final.copy()
df_test[['Churn']].to_csv('sub.csv')
#writes the results of the prediction in a csv file